**********************************
* 3. SIMULATING HUMANITY DATASET *
**********************************

*******************************************************
* CONTENT:	(1) Out-of-sample test (country-years)    *
*			(2) Out-of-sample tests (countries)		  *
*           (3) Predicting socio-demographics         *   
*			(4) Expanding the dataset proportionally  *
*			(5) Creating synthetic respondents        *
*******************************************************

set more off

* Seed the random-number generator. The out-of-sample splits below are
* drawn with runiform(), so without a fixed seed the held-out samples
* (and therefore the reported results) change on every run.
* The seed value itself is arbitrary.
set seed 20240315

* (1) Out-of-sample tests (country-years)
*****************************************

* Loading QOG datasets (any data currently in memory is discarded)

use /Users/damienbol/Desktop/WPO/WPO_QOG_WVS.dta, clear

* Keeping variables of interest: the country-level covariates, the
* identifiers (ht_region, year, country) and the *_mean, *_sd and corr_*
* variables that serve as regression outcomes below

keep wdi_pop wdi_popf wdi_pop14 wdi_pop65 vdem_libdem vdem_corr ///
     wdi_gdpcappppcur wdi_tele lp_lat_abst wdi_popurb ///
     lp_catho80 lp_muslim80 lp_protmg80 ht_region year country ///
     female_mean female_sd edu_mean edu_sd age_mean age_sd ///
     urban_mean urban_sd income_mean income_sd ///
     corr_female_edu corr_female_age corr_female_urban corr_female_income ///
     corr_edu_age corr_edu_urban corr_edu_income ///
     corr_age_urban corr_age_income corr_urban_income

* Logging variables that need to be logged, and squaring the year
* (ln() is the natural logarithm, a synonym of log() in Stata)

generate year_sq = year^2
generate wdi_pop_log = ln(wdi_pop)
generate gdp_log = ln(wdi_gdpcappppcur)

* Creating general inclusion variables: inclusion starts at 1 and is set
* to 0 as soon as any one of the regression covariates is missing

local covariates wdi_pop_log wdi_popf wdi_pop14 wdi_pop65 vdem_libdem ///
                 vdem_corr gdp_log wdi_tele lp_lat_abst wdi_popurb ///
                 lp_catho80 lp_muslim80 lp_protmg80

generate inclusion = 1
foreach covar of local covariates {
    replace inclusion = 0 if `covar' == .
}

* Creating inclusion variables for means and standard deviation
*
* For each socio-demographic, the flag is 1 only when the corresponding
* *_mean variable is observed AND the general inclusion flag is 1.

foreach v in female edu age urban income {

    gen inclusion_`v'=1
    replace inclusion_`v'=0 if `v'_mean==.

    * inclusion is coded 0/1 and is never missing, so the general
    * exclusion must be tested against 0. The former test against
    * missing never matched any observation, which let country-years
    * with incomplete covariates be drawn into the held-out samples.
    replace inclusion_`v'=0 if inclusion==0

}

* Out-of-sample test for means
*
* For every socio-demographic and each of 21 random draws:
*   1. give each eligible country-year a uniform random number and sort
*      on it (rows without a number are missing and sort last);
*   2. flag the first 45 rows as the held-out test sample;
*   3. regress the mean on the covariates using the eligible rows that
*      are not held out;
*   4. store the prediction error (observed minus predicted) for the
*      held-out rows.

local predictors wdi_pop_log wdi_popf wdi_pop14 wdi_pop65 vdem_libdem ///
                 vdem_corr gdp_log wdi_tele lp_lat_abst wdi_popurb ///
                 lp_catho80 lp_muslim80 lp_protmg80 year year_sq

foreach v in female edu age urban income {
    forvalues draw = 1/21 {

        generate random_`v'_mean`draw' = runiform() if inclusion_`v'==1
        sort random_`v'_mean`draw'
        generate test_`v'_mean`draw' = 1 if _n<=45

        quietly regress `v'_mean `predictors' ///
            if inclusion_`v'==1 & test_`v'_mean`draw'!=1
        predict `v'_mean_pred_test`draw'
        replace `v'_mean_pred_test`draw' = . if inclusion_`v'==0

        generate `v'_mean_difference_test`draw' = ///
            `v'_mean - `v'_mean_pred_test`draw' if test_`v'_mean`draw'==1

    }
}

* Out-of-sample test for standard deviations
*
* Same procedure as for the means, applied to the *_sd outcomes: 21 draws
* per variable, 45 randomly chosen eligible country-years held out in
* each draw, model fitted on the remaining eligible rows, and the
* prediction error stored for the held-out rows. Eligibility is taken
* from the inclusion_<variable> flags.

local predictors wdi_pop_log wdi_popf wdi_pop14 wdi_pop65 vdem_libdem ///
                 vdem_corr gdp_log wdi_tele lp_lat_abst wdi_popurb ///
                 lp_catho80 lp_muslim80 lp_protmg80 year year_sq

foreach v in female edu age urban income {
    forvalues draw = 1/21 {

        * random ordering; rows that are not eligible get no number and sort last
        generate random_`v'_sd`draw' = runiform() if inclusion_`v'==1
        sort random_`v'_sd`draw'
        generate test_`v'_sd`draw' = 1 if _n<=45

        quietly regress `v'_sd `predictors' ///
            if inclusion_`v'==1 & test_`v'_sd`draw'!=1
        predict `v'_sd_pred_test`draw'
        replace `v'_sd_pred_test`draw' = . if inclusion_`v'==0

        generate `v'_sd_difference_test`draw' = ///
            `v'_sd - `v'_sd_pred_test`draw' if test_`v'_sd`draw'==1

    }
}

* Creating inclusion variables for correlations
*
* For each pair of socio-demographics, the flag is 1 only when both
* *_mean variables are observed AND the general inclusion flag is 1.

foreach pair in "female edu" "female age" "female urban" "female income" ///
                "edu age" "edu urban" "edu income" ///
                "age urban" "age income" "urban income" {

    * split the pair into its two variable stems (positional macros 1 and 2)
    tokenize `pair'

    gen inclusion_corr_`1'_`2'=1
    replace inclusion_corr_`1'_`2'=0 if `1'_mean==. | `2'_mean==.

    * inclusion is coded 0/1 and is never missing, so the general
    * exclusion must be tested against 0. The former test against
    * missing never matched any observation.
    replace inclusion_corr_`1'_`2'=0 if inclusion==0

}

* Out-of-sample test for correlations
*
* For every pair of socio-demographics and each of 21 random draws:
* hold out 45 randomly chosen eligible country-years, regress the
* correlation on the covariates using the remaining eligible rows, and
* store the prediction error (observed minus predicted) for the held-out
* rows.

local predictors wdi_pop_log wdi_popf wdi_pop14 wdi_pop65 vdem_libdem ///
                 vdem_corr gdp_log wdi_tele lp_lat_abst wdi_popurb ///
                 lp_catho80 lp_muslim80 lp_protmg80 year year_sq

local pairs female_edu female_age female_urban female_income ///
            edu_age edu_urban edu_income ///
            age_urban age_income urban_income

foreach p of local pairs {
    forvalues draw = 1/21 {

        * random ordering; rows that are not eligible get no number and sort last
        generate random_corr_`p'`draw' = runiform() if inclusion_corr_`p'==1
        sort random_corr_`p'`draw'
        generate test_corr_`p'`draw' = 1 if _n<=45

        quietly regress corr_`p' `predictors' ///
            if inclusion_corr_`p'==1 & test_corr_`p'`draw'!=1
        predict corr_`p'_pred_test`draw'
        replace corr_`p'_pred_test`draw' = . if inclusion_corr_`p'==0

        generate corr_`p'_diff_test`draw' = ///
            corr_`p' - corr_`p'_pred_test`draw' if test_corr_`p'`draw'==1

    }
}

* Creating averages and standard deviation of differences
*
* For every outcome, <outcome>_average and <outcome>_sd hold the mean and
* the standard deviation of the held-out prediction errors pooled over
* the 21 draws (constant across observations).
*
* Why this is not written as egen ... = mean(x_test1-x_test21):
* egen's mean() and sd() take an expression, not a variable list, so
* "x_test1-x_test21" is evaluated as x_test1 MINUS x_test21. That gives
* statistics of the gap between draw 1 and draw 21 (defined only for rows
* held out in both draws) instead of statistics over all 21 draws. The
* previous code additionally subtracted female_sd_difference_test1 from
* itself and used sd() for most of the corr_*_average variables.

local targets female_mean edu_mean age_mean urban_mean income_mean ///
              female_sd edu_sd age_sd urban_sd income_sd ///
              corr_female_edu corr_female_age corr_female_urban ///
              corr_female_income corr_edu_age corr_edu_urban ///
              corr_edu_income corr_age_urban corr_age_income ///
              corr_urban_income

tempname pooled_mean pooled_sd

foreach t of local targets {

    * prediction errors are stored as <t>_difference_test# for means and
    * standard deviations, and as <t>_diff_test# for correlations
    local stub `t'_difference_test
    if substr("`t'", 1, 5) == "corr_" {
        local stub `t'_diff_test
    }

    * stack the 21 error variables into one long column, summarize it,
    * then bring the original data back
    preserve
    stack `stub'*, into(pooled_difference) clear
    quietly summarize pooled_difference
    scalar `pooled_mean' = r(mean)
    scalar `pooled_sd' = r(sd)
    restore

    gen `t'_average = `pooled_mean'
    gen `t'_sd = `pooled_sd'

}

* Results
*
* Summary table for each *_average variable first, then for each *_sd
* variable, in the order means, standard deviations, correlations.

local outcomes female_mean edu_mean age_mean urban_mean income_mean ///
               female_sd edu_sd age_sd urban_sd income_sd ///
               corr_female_edu corr_female_age corr_female_urban ///
               corr_female_income corr_edu_age corr_edu_urban ///
               corr_edu_income corr_age_urban corr_age_income ///
               corr_urban_income

foreach stat in average sd {
    foreach outcome of local outcomes {
        summarize `outcome'_`stat'
    }
}

* (2) Out-of-sample tests (countries)
*************************************

* Loading QOG datasets (any data currently in memory is discarded;
* the file is looked up relative to the current working directory)

use WPO_QOG_WVS.dta, clear

* Keeping variables of interest: the country-level covariates, the
* identifiers (ht_region, year, country) and the *_mean, *_sd and corr_*
* variables that serve as regression outcomes below

keep wdi_pop wdi_popf wdi_pop14 wdi_pop65 vdem_libdem vdem_corr ///
     wdi_gdpcappppcur wdi_tele lp_lat_abst wdi_popurb ///
     lp_catho80 lp_muslim80 lp_protmg80 ht_region year country ///
     female_mean female_sd edu_mean edu_sd age_mean age_sd ///
     urban_mean urban_sd income_mean income_sd ///
     corr_female_edu corr_female_age corr_female_urban corr_female_income ///
     corr_edu_age corr_edu_urban corr_edu_income ///
     corr_age_urban corr_age_income corr_urban_income

* Logging variables that need to be logged, and squaring the year
* (ln() is the natural logarithm, a synonym of log() in Stata)

generate year_sq = year^2
generate wdi_pop_log = ln(wdi_pop)
generate gdp_log = ln(wdi_gdpcappppcur)

* Creating general inclusion variables: inclusion starts at 1 and is set
* to 0 as soon as any one of the regression covariates is missing

local covariates wdi_pop_log wdi_popf wdi_pop14 wdi_pop65 vdem_libdem ///
                 vdem_corr gdp_log wdi_tele lp_lat_abst wdi_popurb ///
                 lp_catho80 lp_muslim80 lp_protmg80

generate inclusion = 1
foreach covar of local covariates {
    replace inclusion = 0 if `covar' == .
}


* Creating inclusion variables for means and standard deviations
*
* For each socio-demographic, the flag is 1 only when the corresponding
* *_mean variable is observed AND the general inclusion flag is 1.

foreach v in female edu age urban income {

    gen inclusion_`v'=1
    replace inclusion_`v'=0 if `v'_mean==.

    * inclusion is coded 0/1 and is never missing, so the general
    * exclusion must be tested against 0. The former test against
    * missing never matched any observation, which let country-years
    * with incomplete covariates be drawn into the held-out samples.
    replace inclusion_`v'=0 if inclusion==0

}

* Out-of-sample test for means
*
* For every socio-demographic and each of 21 random draws:
*   1. give each eligible country-year a uniform random number and sort
*      on it (rows without a number are missing and sort last);
*   2. flag the first 14 rows;
*   3. extend the flag to every row of a country that has at least one
*      flagged row (the *2b* variable), so whole countries are held out;
*   4. regress the mean on the covariates using eligible rows of the
*      countries that are not held out;
*   5. store the prediction error (observed minus predicted) for the
*      rows of the held-out countries.

local predictors wdi_pop_log wdi_popf wdi_pop14 wdi_pop65 vdem_libdem ///
                 vdem_corr gdp_log wdi_tele lp_lat_abst wdi_popurb ///
                 lp_catho80 lp_muslim80 lp_protmg80 year year_sq

foreach v in female edu age urban income {
    forvalues draw = 1/21 {

        generate random_`v'_mean`draw' = runiform() if inclusion_`v'==1
        sort random_`v'_mean`draw'
        generate test_`v'_mean`draw' = 1 if _n<=14
        by country, sort: egen test_`v'_mean2b`draw' = max(test_`v'_mean`draw')

        quietly regress `v'_mean `predictors' ///
            if inclusion_`v'==1 & test_`v'_mean2b`draw'!=1
        predict `v'_mean_pred_test`draw'
        replace `v'_mean_pred_test`draw' = . if inclusion_`v'==0

        generate `v'_mean_difference_test`draw' = ///
            `v'_mean - `v'_mean_pred_test`draw' if test_`v'_mean2b`draw'==1

    }
}


* Out-of-sample test for standard deviations
*
* Country-level hold-out for the *_sd outcomes of female, edu, age and
* urban: in each of 21 draws, the first 14 randomly ordered eligible
* country-years are flagged, the flag is extended to all rows of their
* countries (the *2b* variable), the model is fitted on eligible rows of
* the remaining countries, and the prediction error is stored for the
* rows of the held-out countries.

local predictors wdi_pop_log wdi_popf wdi_pop14 wdi_pop65 vdem_libdem ///
                 vdem_corr gdp_log wdi_tele lp_lat_abst wdi_popurb ///
                 lp_catho80 lp_muslim80 lp_protmg80 year year_sq

foreach v in female edu age urban {
    forvalues draw = 1/21 {

        * random ordering; rows that are not eligible get no number and sort last
        generate random_`v'_sd`draw' = runiform() if inclusion_`v'==1
        sort random_`v'_sd`draw'
        generate test_`v'_sd`draw' = 1 if _n<=14
        by country, sort: egen test_`v'_sd2b`draw' = max(test_`v'_sd`draw')

        quietly regress `v'_sd `predictors' ///
            if inclusion_`v'==1 & test_`v'_sd2b`draw'!=1
        predict `v'_sd_pred_test`draw'
        replace `v'_sd_pred_test`draw' = . if inclusion_`v'==0

        generate `v'_sd_difference_test`draw' = ///
            `v'_sd - `v'_sd_pred_test`draw' if test_`v'_sd2b`draw'==1

    }
}

* Country-level out-of-sample test for income_sd (21 random draws)
forvalues i = 1(1)21{

* random ordering of eligible country-years; rows without a number sort last
gen random_income_sd`i'=runiform() if inclusion_income==1
sort random_income_sd`i'
gen test_income_sd`i'=1 if _n<15
* extend the flag to every row of a country with at least one flagged row
bysort country: egen test_income_sd2b`i'=max(test_income_sd`i') 


* Fit the model on countries that are not held out. The filter must use
* the country-level flag (2b), as in every other loop of this section:
* filtering on the row-level flag left the other years of the held-out
* countries in the estimation sample although their errors are evaluated
* below as out-of-sample.
qui reg income_sd wdi_pop_log wdi_popf wdi_pop14 wdi_pop65 vdem_libdem vdem_corr gdp_log wdi_tele lp_lat_abst wdi_popurb lp_catho80 lp_muslim80 lp_protmg80 year year_sq if inclusion_income==1  & test_income_sd2b`i'!=1
predict income_sd_pred_test`i'
replace income_sd_pred_test`i'=. if inclusion_income==0

* prediction error (observed minus predicted) for the held-out countries
gen income_sd_difference_test`i'=income_sd-income_sd_pred_test`i' if test_income_sd2b`i'==1

}


* Creating inclusion variables for correlations
*
* For each pair of socio-demographics, the flag is 1 only when both
* *_mean variables are observed AND the general inclusion flag is 1.

foreach pair in "female edu" "female age" "female urban" "female income" ///
                "edu age" "edu urban" "edu income" ///
                "age urban" "age income" "urban income" {

    * split the pair into its two variable stems (positional macros 1 and 2)
    tokenize `pair'

    gen inclusion_corr_`1'_`2'=1
    replace inclusion_corr_`1'_`2'=0 if `1'_mean==. | `2'_mean==.

    * inclusion is coded 0/1 and is never missing, so the general
    * exclusion must be tested against 0. The former test against
    * missing never matched any observation.
    replace inclusion_corr_`1'_`2'=0 if inclusion==0

}


* Out-of-sample test for correlations
*
* Country-level hold-out for the correlations female-edu, female-age,
* female-urban, female-income, edu-age and edu-urban: in each of 21
* draws, the first 14 randomly ordered eligible country-years are
* flagged, the flag is extended to all rows of their countries (the *2b*
* variable), the model is fitted on eligible rows of the remaining
* countries, and the prediction error is stored for the rows of the
* held-out countries.

local predictors wdi_pop_log wdi_popf wdi_pop14 wdi_pop65 vdem_libdem ///
                 vdem_corr gdp_log wdi_tele lp_lat_abst wdi_popurb ///
                 lp_catho80 lp_muslim80 lp_protmg80 year year_sq

foreach p in female_edu female_age female_urban female_income edu_age edu_urban {
    forvalues draw = 1/21 {

        * random ordering; rows that are not eligible get no number and sort last
        generate random_corr_`p'`draw' = runiform() if inclusion_corr_`p'==1
        sort random_corr_`p'`draw'
        generate test_corr_`p'`draw' = 1 if _n<=14
        by country, sort: egen test_corr_`p'2b`draw' = max(test_corr_`p'`draw')

        quietly regress corr_`p' `predictors' ///
            if inclusion_corr_`p'==1 & test_corr_`p'2b`draw'!=1
        predict corr_`p'_pred_test`draw'
        replace corr_`p'_pred_test`draw' = . if inclusion_corr_`p'==0

        generate corr_`p'_diff_test`draw' = ///
            corr_`p' - corr_`p'_pred_test`draw' if test_corr_`p'2b`draw'==1

    }
}

forvalues i = 1(1)21{

gen random_corr_edu_income`i'=runiform() if inclusion_corr_edu_income==1
sort random_corr_edu_income`i'
gen test_corr_edu_income`i'=1 if _n<15
bysort country: egen test_corr_edu_income2b`i'=max(test_corr_edu_income`i') 


qui reg corr_edu_income wdi_pop_log wdi_popf wdi_pop14 wdi_pop65 vdem_libdem vdem_corr gdp_log wdi_tele lp_lat_abst wdi_popurb lp_catho80 lp_muslim80 lp_protmg80 year year_sq if inclusion_corr_edu_income==1  & test_corr_edu_income2b`i'!=1
predict corr_edu_income_pred_test`i'
replace corr_edu_income_pred_test`i'=. if inclusion_corr_edu_income==0

gen corr_edu_income_diff_test`i'=corr_edu_income-corr_edu_income_pred_test`i' if test_corr_edu_income2b`i'==1

}

forvalues i = 1(1)21{

gen random_corr_age_urban`i'=runiform() if inclusion_corr_age_urban==1
sort random_corr_age_urban`i'
gen test_corr_age_urban`i'=1 if _n<15
bysort country: egen test_corr_age_urban2b`i'=max(test_corr_age_urban`i') 


qui reg corr_age_urban wdi_pop_log wdi_popf wdi_pop14 wdi_pop65 vdem_libdem vdem_corr gdp_log wdi_tele lp_lat_abst wdi_popurb lp_catho80 lp_muslim80 lp_protmg80 year year_sq if inclusion_corr_age_urban==1  & test_corr_age_urban2b`i'!=1
predict corr_age_urban_pred_test`i'
replace corr_age_urban_pred_test`i'=. if inclusion_corr_age_urban==0

gen corr_age_urban_diff_test`i'=corr_age_urban-corr_age_urban_pred_test`i' if test_corr_age_urban2b`i'==1

}


forvalues i = 1(1)21{

gen random_corr_age_income`i'=runiform() if inclusion_corr_age_income==1
sort random_corr_age_income`i'
gen test_corr_age_income`i'=1 if _n<15
bysort country: egen test_corr_age_income2b`i'=max(test_corr_age_income`i') 


qui reg corr_age_income wdi_pop_log wdi_popf wdi_pop14 wdi_pop65 vdem_libdem vdem_corr gdp_log wdi_tele lp_lat_abst wdi_popurb lp_catho80 lp_muslim80 lp_protmg80 year year_sq if inclusion_corr_age_income==1  & test_corr_age_income2b`i'!=1
predict corr_age_income_pred_test`i'
replace corr_age_income_pred_test`i'=. if inclusion_corr_age_income==0

gen corr_age_income_diff_test`i'=corr_age_income-corr_age_income_pred_test`i' if test_corr_age_income2b`i'==1

}


forvalues i = 1(1)21{

gen random_corr_urban_income`i'=runiform() if inclusion_corr_urban_income==1
sort random_corr_urban_income`i'
gen test_corr_urban_income`i'=1 if _n<15
bysort country: egen test_corr_urban_income2b`i'=max(test_corr_urban_income`i') 


qui reg corr_urban_income wdi_pop_log wdi_popf wdi_pop14 wdi_pop65 vdem_libdem vdem_corr gdp_log wdi_tele lp_lat_abst wdi_popurb lp_catho80 lp_muslim80 lp_protmg80 year year_sq if inclusion_corr_urban_income==1  & test_corr_urban_income2b`i'!=1
predict corr_urban_income_pred_test`i'
replace corr_urban_income_pred_test`i'=. if inclusion_corr_urban_income==0

gen corr_urban_income_diff_test`i'=corr_urban_income-corr_urban_income_pred_test`i' if test_corr_urban_income2b`i'==1

}

* Creating averages and standard deviation of differences
*
* For every target statistic, the prediction errors of the 21 out-of-sample
* rounds are pooled; their mean is stored in <stat>_average and their
* standard deviation in <stat>_sd (both constant across observations).
*
* The pooling is done explicitly because egen's mean() and sd() take an
* EXPRESSION: mean(x_test1-x_test21) is the mean of x_test1 minus x_test21,
* not a summary over the variables x_test1 ... x_test21.
* NOTE(review): pooling all held-out errors is assumed to be the intended
* summary -- confirm against the reported results.

* Statistics whose error variables are named <stat>_difference_test#
local pooled_difference female_mean edu_mean age_mean urban_mean income_mean female_sd edu_sd age_sd urban_sd income_sd

* Statistics whose error variables are named <stat>_diff_test#
local pooled_diff corr_female_edu corr_female_age corr_female_urban corr_female_income corr_edu_age corr_edu_urban corr_edu_income corr_age_urban corr_age_income corr_urban_income

foreach suffix in difference diff {

    foreach stat of local pooled_`suffix' {

        tempname n_all sum_all mean_all ssd_all

        * Pass 1: pooled count and pooled mean over the 21 rounds
        scalar `n_all' = 0
        scalar `sum_all' = 0
        forvalues round = 1/21 {
            quietly summarize `stat'_`suffix'_test`round'
            scalar `n_all' = `n_all' + r(N)
            scalar `sum_all' = `sum_all' + r(sum)
        }
        scalar `mean_all' = `sum_all' / `n_all'

        * Pass 2: pooled sum of squared deviations around the pooled mean
        scalar `ssd_all' = 0
        forvalues round = 1/21 {
            tempvar sqdev
            quietly generate double `sqdev' = (`stat'_`suffix'_test`round' - `mean_all')^2
            quietly summarize `sqdev'
            scalar `ssd_all' = `ssd_all' + r(sum)
            drop `sqdev'
        }

        generate `stat'_average = `mean_all'
        * The standard deviation is left missing when fewer than two errors exist
        generate `stat'_sd = sqrt(`ssd_all' / (`n_all' - 1)) if `n_all' > 1

    }

}



* Results
*
* Displays every summary variable: first all the *_average variables, then
* all the *_sd variables, each in the order means, standard deviations,
* correlations.

local reported_stats female_mean edu_mean age_mean urban_mean income_mean female_sd edu_sd age_sd urban_sd income_sd corr_female_edu corr_female_age corr_female_urban corr_female_income corr_edu_age corr_edu_urban corr_edu_income corr_age_urban corr_age_income corr_urban_income

foreach moment in average sd {
    foreach stat of local reported_stats {
        summarize `stat'_`moment'
    }
}


* (3) Predicting socio-demographics
***********************************

* Loading QOG datasets

use WPO_QOG_WVS.dta, clear

* Keeping variables of interest

keep wdi_pop wdi_popf wdi_pop14 wdi_pop65 vdem_libdem vdem_corr ///
     wdi_gdpcappppcur wdi_tele lp_lat_abst wdi_popurb ///
     lp_catho80 lp_muslim80 lp_protmg80 ht_region year country ///
     female_mean female_sd edu_mean edu_sd age_mean age_sd ///
     urban_mean urban_sd income_mean income_sd ///
     corr_female_edu corr_female_age corr_female_urban corr_female_income ///
     corr_edu_age corr_edu_urban corr_edu_income ///
     corr_age_urban corr_age_income corr_urban_income

* Logging variables and squaring time

generate year_sq = year^2
generate wdi_pop_log = ln(wdi_pop)
generate gdp_log = ln(wdi_gdpcappppcur)


* Regressions for means and standard deviations
*
* Each target is regressed on the same covariates and its fitted values are
* stored in <target>_sim.

local sim_covariates wdi_pop_log wdi_popf wdi_pop14 wdi_pop65 vdem_libdem vdem_corr gdp_log wdi_tele lp_lat_abst wdi_popurb lp_catho80 lp_muslim80 lp_protmg80 year year_sq

foreach target in female_mean female_sd edu_mean edu_sd age_mean age_sd urban_mean urban_sd income_mean income_sd {

    regress `target' `sim_covariates'
    predict `target'_sim

    if "`target'" == "urban_sd" {
        * The prediction gives a negative sd for Qatar and the United Arab
        * Emirates, which are just one big city: floor it at zero.
        replace urban_sd_sim = 0 if urban_sd_sim < 0
    }

}

* Regressions for correlations
*
* Each pairwise correlation is regressed on the same covariates and its
* fitted values are stored in <correlation>_sim.

local corr_covariates wdi_pop_log wdi_popf wdi_pop14 wdi_pop65 vdem_libdem vdem_corr gdp_log wdi_tele lp_lat_abst wdi_popurb lp_catho80 lp_muslim80 lp_protmg80 year year_sq

foreach pair in corr_female_edu corr_female_age corr_female_urban corr_female_income corr_edu_age corr_edu_urban corr_edu_income corr_age_urban corr_age_income corr_urban_income {

    regress `pair' `corr_covariates'
    predict `pair'_sim

}

* Keeping 1994-2020 only, and dropping country-years without a prediction

keep if inrange(year, 1994, 2020)
drop if missing(corr_age_income_sim)

* Creating id variable (one id per row, rows ordered by country then year)

sort country year
generate country_id = _n

* Saving in matrices
*
* For every row (identified by country_id) the predicted moments are copied
* into Stata matrices:
*   mean_#        predicted means, in the order female, edu, age, urban, income
*   sd_#          predicted standard deviations, same order
*   corr_final_#  5 x 5 correlation matrix, same order, ones on the diagonal
*
* The loop runs over the current number of observations instead of a
* hard-coded row count, so it stays correct if the sample changes
* (country_id is created as _n).

local n_country_years = _N

forvalues i = 1/`n_country_years' {

    mkmat female_mean_sim edu_mean_sim age_mean_sim urban_mean_sim income_mean_sim if country_id==`i', matrix(mean_`i')
    mkmat female_sd_sim edu_sd_sim age_sd_sim urban_sd_sim income_sd_sim if country_id==`i', matrix(sd_`i')

    * Pieces to the left of the diagonal, one per row of the final matrix
    mkmat corr_female_edu_sim if country_id==`i', matrix(corr1_`i')
    mkmat corr_female_age_sim corr_edu_age_sim if country_id==`i', matrix(corr2_`i')
    mkmat corr_female_urban_sim corr_edu_urban_sim corr_age_urban_sim if country_id==`i', matrix(corr3_`i')
    mkmat corr_female_income_sim corr_edu_income_sim corr_age_income_sim corr_urban_income_sim if country_id==`i', matrix(corr4_`i')

    * Pieces to the right of the diagonal, one per row of the final matrix
    mkmat corr_female_edu_sim corr_female_age_sim corr_female_urban_sim corr_female_income_sim if country_id==`i', matrix(corrA_`i')
    mkmat corr_edu_age_sim corr_edu_urban_sim corr_edu_income_sim if country_id==`i', matrix(corrB_`i')
    mkmat corr_age_urban_sim corr_age_income_sim if country_id==`i', matrix(corrC_`i')
    mkmat corr_urban_income_sim if country_id==`i', matrix(corrD_`i')

    * Stack the five rows: (left piece, 1, right piece)
    matrix corr_final_`i' = (1, corrA_`i') \ ///
                            (corr1_`i', 1, corrB_`i') \ ///
                            (corr2_`i', 1, corrC_`i') \ ///
                            (corr3_`i', 1, corrD_`i') \ ///
                            (corr4_`i', 1)

}

* (4) Expanding the dataset proportionally
******************************************

* Creating variable of number of people > 15 per country, in 10k
* (wdi_pop14 is divided by 100 to get a share, which is then applied to
* wdi_pop and subtracted from it)

generate wdi_pop14_percent = wdi_pop14/100
generate wdi_pop14_absolute = wdi_pop*wdi_pop14_percent
generate pop_over15 = wdi_pop-wdi_pop14_absolute
generate pop_over15_10k = pop_over15/10000

* Rounding population number

replace pop_over15_10k = round(pop_over15_10k)

* Dropping unnecessary variables

drop female_sd edu_sd age_sd urban_sd income_sd ///
     corr_female_edu corr_female_age corr_female_urban corr_female_income ///
     corr_edu_age corr_edu_urban corr_edu_income ///
     corr_age_urban corr_age_income corr_urban_income ///
     female_mean_sim female_sd_sim edu_mean_sim edu_sd_sim ///
     age_mean_sim age_sd_sim urban_mean_sim urban_sd_sim ///
     income_mean_sim income_sd_sim ///
     corr_female_edu_sim corr_female_age_sim corr_female_urban_sim corr_female_income_sim ///
     corr_edu_age_sim corr_edu_urban_sim corr_edu_income_sim ///
     corr_age_urban_sim corr_age_income_sim corr_urban_income_sim

* Saving in a new dataset

save WPO_world_data.dta, replace

* (5) Creating synthetic respondents
************************************

use WPO_world_data.dta, clear

* Creating one dataset per country-year: the generating command takes a
* single set of means / sds / correlations at a time, so each country_id is
* processed separately and saved to its own file (<country_id>.dta).
* The command used is corr2data, fed with the matrices mean_#, sd_# and
* corr_final_#.
*
* Seychelles are too small to be included: their ids (2880-2906) are skipped.

foreach id_range in "1/2879" "2907/3589" {

    forvalues i = `id_range' {

        use WPO_world_data.dta, clear
        keep if country_id==`i'

        * One row per 10,000 inhabitants over 15
        expand pop_over15_10k

        corr2data female education age urban income, means(mean_`i') sd(sd_`i') corr(corr_final_`i')
        save `i'.dta, replace

    }

}


* Appending all the datasets (in several batches, so as not to overwhelm
* application memory). Ids 2880 to 2906 are Seychelles: they are too small
* to be included, so no file exists for them and they are skipped.

use 1.dta, clear

foreach batch in "2/1000" "1001/2000" "2001/2879" "2907/3589" {

    forvalues i = `batch' {

        append using `i'.dta, nonotes

    }

}


* Removing small countries (below 10,000)

drop if pop_over15<10000

save WPO_world_data_10k.dta, replace
